home *** CD-ROM | disk | FTP | other *** search
- Path: xanth!ames!mailrus!ulowell!page
- From: page@swan.ulowell.edu (Bob Page)
- Newsgroups: comp.sources.amiga
- Subject: v02i085: regexp - regular-expression routines, Part01/02
- Message-ID: <10484@swan.ulowell.edu>
- Date: 5 Dec 88 22:05:51 GMT
- Organization: University of Lowell, Computer Science Dept.
- Lines: 737
- Approved: page@swan.ulowell.edu
-
- Submitted-by: grwalter@watcgl.waterloo.edu
- Posting-number: Volume 2, Issue 85
- Archive-name: unix/regexp.1
-
- This is a reimplementation of the Unix V8 regexp(3) package. It gives
- C programs the ability to use egrep-style regular expressions, and
- does it in a much cleaner fashion than the analogous routines in SysV.
-
- [You need this to recompile stevie. ..Bob]
-
- # This is a shell archive.
- # Remove everything above and including the cut line.
- # Then run the rest of the file through sh.
- #----cut here-----cut here-----cut here-----cut here----#
- #!/bin/sh
- # shar: Shell Archiver
- # Run the following text with /bin/sh to create:
- # MAKE
- # Makefile
- # README
- # tests
- # timer.c
- # try.c
- # This archive created: Mon Nov 28 19:50:49 1988
- cat << \SHAR_EOF > MAKE
- lc -M -Rregexp.lib regexp.c regsub.c regerror.c
- SHAR_EOF
- cat << \SHAR_EOF > Makefile
- # Things you might want to put in ENV and LENV:
- # -Dvoid=int compilers that don't do void
- # -DCHARBITS=0377 compilers that don't do unsigned char
- # -DSTATIC=extern compilers that don't like "static foo();" as forward decl
- # -DSTRCSPN library does not have strcspn()
- # -Dstrchr=index library does not have strchr()
- # -DERRAVAIL have utzoo-compatible error() function and friends
- ENV=-DSTRCSPN
- LENV=-DSTRCSPN
-
- # Things you might want to put in TEST:
- # -DDEBUG debugging hooks
- # -I. regexp.h from current directory, not /usr/include
- TEST=-I.
-
- # Things you might want to put in PROF:
- # -Dstatic='/* */' make everything global so profiler can see it.
- # -p profiler
- PROF=
-
- CFLAGS=-O $(ENV) $(TEST) $(PROF)
- LINTFLAGS=$(LENV) $(TEST) -ha
- LDFLAGS=-i
-
- OBJ=regexp.o regsub.o
- LSRC=regexp.c regsub.c regerror.c
- DTR=README dMakefile regexp.3 regexp.h regexp.c regsub.c regerror.c \
- regmagic.h try.c timer.c tests
- DEST = ..
-
- try: try.o $(OBJ)
- cc $(LDFLAGS) try.o $(OBJ) -o try
-
- # Making timer will probably require putting stuff in $(PROF) and then
- # recompiling everything; the following is just the final stage.
- timer: timer.o $(OBJ)
- cc $(LDFLAGS) $(PROF) timer.o $(OBJ) -o timer
-
- timer.o: timer.c timer.t.h
-
- timer.t.h: tests
- sed 's/ /","/g;s/\\/&&/g;s/.*/{"&"},/' tests >timer.t.h
-
- # Regression test.
- r: try tests
- @echo 'No news is good news...'
- try <tests
-
- lint: timer.t.h
- @echo 'Complaints about multiply-declared regerror() are legit.'
- lint $(LINTFLAGS) $(LSRC) try.c
- lint $(LINTFLAGS) $(LSRC) timer.c
-
- regexp.o: regexp.c regexp.h regmagic.h
- regsub.o: regsub.c regexp.h regmagic.h
-
- clean:
- rm -f *.o *.out core mon.out timer.t.h dMakefile dtr try timer
-
- dtr: r makedtr $(DTR)
- makedtr $(DTR) >dtr
-
- dMakefile: Makefile
- sed '/^L*ENV=/s/ *-DERRAVAIL//' Makefile >dMakefile
-
- mv: $(OBJ) regerror.o
- mv $(OBJ) regerror.o $(DEST)
- SHAR_EOF
- cat << \SHAR_EOF > README
- This is a nearly-public-domain reimplementation of the V8 regexp(3) package.
- It gives C programs the ability to use egrep-style regular expressions, and
- does it in a much cleaner fashion than the analogous routines in SysV.
-
- Copyright (c) 1986 by University of Toronto.
- Written by Henry Spencer. Not derived from licensed software.
-
- Permission is granted to anyone to use this software for any
- purpose on any computer system, and to redistribute it freely,
- subject to the following restrictions:
-
- 1. The author is not responsible for the consequences of use of
- this software, no matter how awful, even if they arise
- from defects in it.
-
- 2. The origin of this software must not be misrepresented, either
- by explicit claim or by omission.
-
- 3. Altered versions must be plainly marked as such, and must not
- be misrepresented as being the original software.
-
- Barring a couple of small items in the BUGS list, this implementation is
- believed 100% compatible with V8. It should even be binary-compatible,
- sort of, since the only fields in a "struct regexp" that other people have
- any business touching are declared in exactly the same way at the same
- location in the struct (the beginning).
-
- This implementation is *NOT* AT&T/Bell code, and is not derived from licensed
- software. Even though U of T is a V8 licensee. This software is based on
- a V8 manual page sent to me by Dennis Ritchie (the manual page enclosed
- here is a complete rewrite and hence is not covered by AT&T copyright).
- The software was nearly complete at the time of arrival of our V8 tape.
- I haven't even looked at V8 yet, although a friend elsewhere at U of T has
- been kind enough to run a few test programs using the V8 regexp(3) to resolve
- a few fine points. I admit to some familiarity with regular-expression
- implementations of the past, but the only one that this code traces any
- ancestry to is the one published in Kernighan & Plauger (from which this
- one draws ideas but not code).
-
- Simplistically: put this stuff into a source directory, copy regexp.h into
- /usr/include, inspect Makefile for compilation options that need changing
- to suit your local environment, and then do "make r". This compiles the
- regexp(3) functions, compiles a test program, and runs a large set of
- regression tests. If there are no complaints, then put regexp.o, regsub.o,
- and regerror.o into your C library, and regexp.3 into your manual-pages
- directory.
-
- Note that if you don't put regexp.h into /usr/include *before* compiling,
- you'll have to add "-I." to CFLAGS before compiling.
-
- The files are:
-
- Makefile instructions to make everything
- regexp.3 manual page
- regexp.h header file, for /usr/include
- regexp.c source for regcomp() and regexec()
- regsub.c source for regsub()
- regerror.c source for default regerror()
- regmagic.h internal header file
- try.c source for test program
- timer.c source for timing program
- tests test list for try and timer
-
- This implementation uses nondeterministic automata rather than the
- deterministic ones found in some other implementations, which makes it
- simpler, smaller, and faster at compiling regular expressions, but slower
- at executing them. In theory, anyway. This implementation does employ
- some special-case optimizations to make the simpler cases (which do make
- up the bulk of regular expressions actually used) run quickly. In general,
- if you want blazing speed you're in the wrong place. Replacing the insides
- of egrep with this stuff is probably a mistake; if you want your own egrep
- you're going to have to do a lot more work. But if you want to use regular
- expressions a little bit in something else, you're in luck. Note that many
- existing text editors use nondeterministic regular-expression implementations,
- so you're in good company.
-
- This stuff should be pretty portable, given appropriate option settings.
- If your chars have less than 8 bits, you're going to have to change the
- internal representation of the automaton, although knowledge of the details
- of this is fairly localized. There are no "reserved" char values except for
- NUL, and no special significance is attached to the top bit of chars.
- The string(3) functions are used a fair bit, on the grounds that they are
- probably faster than coding the operations in line. Some attempts at code
- tuning have been made, but this is invariably a bit machine-specific.
- SHAR_EOF
- cat << \SHAR_EOF > tests
- abc abc y & abc
- abc xbc n - -
- abc axc n - -
- abc abx n - -
- abc xabcy y & abc
- abc ababc y & abc
- ab*c abc y & abc
- ab*bc abc y & abc
- ab*bc abbc y & abbc
- ab*bc abbbbc y & abbbbc
- ab+bc abbc y & abbc
- ab+bc abc n - -
- ab+bc abq n - -
- ab+bc abbbbc y & abbbbc
- ab?bc abbc y & abbc
- ab?bc abc y & abc
- ab?bc abbbbc n - -
- ab?c abc y & abc
- ^abc$ abc y & abc
- ^abc$ abcc n - -
- ^abc abcc y & abc
- ^abc$ aabc n - -
- abc$ aabc y & abc
- ^ abc y &
- $ abc y &
- a.c abc y & abc
- a.c axc y & axc
- a.*c axyzc y & axyzc
- a.*c axyzd n - -
- a[bc]d abc n - -
- a[bc]d abd y & abd
- a[b-d]e abd n - -
- a[b-d]e ace y & ace
- a[b-d] aac y & ac
- a[-b] a- y & a-
- a[b-] a- y & a-
- [k] ab n - -
- a[b-a] - c - -
- a[]b - c - -
- a[ - c - -
- a] a] y & a]
- a[]]b a]b y & a]b
- a[^bc]d aed y & aed
- a[^bc]d abd n - -
- a[^-b]c adc y & adc
- a[^-b]c a-c n - -
- a[^]b]c a]c n - -
- a[^]b]c adc y & adc
- ab|cd abc y & ab
- ab|cd abcd y & ab
- ()ef def y &-\1 ef-
- ()* - c - -
- *a - c - -
- ^* - c - -
- $* - c - -
- (*)b - c - -
- $b b n - -
- a\ - c - -
- a\(b a(b y &-\1 a(b-
- a\(*b ab y & ab
- a\(*b a((b y & a((b
- a\\b a\b y & a\b
- abc) - c - -
- (abc - c - -
- ((a)) abc y &-\1-\2 a-a-a
- (a)b(c) abc y &-\1-\2 abc-a-c
- a+b+c aabbabc y & abc
- a** - c - -
- a*? - c - -
- (a*)* - c - -
- (a*)+ - c - -
- (a|)* - c - -
- (a*|b)* - c - -
- (a+|b)* ab y &-\1 ab-b
- (a+|b)+ ab y &-\1 ab-b
- (a+|b)? ab y &-\1 a-a
- [^ab]* cde y & cde
- (^)* - c - -
- (ab|)* - c - -
- )( - c - -
- abc y &
- abc n - -
- a* y &
- abcd abcd y &-\&-\\& abcd-&-\abcd
- a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc
- ([abc])*d abbbcd y &-\1 abbbcd-c
- ([abc])*bcd abcd y &-\1 abcd-a
- a|b|c|d|e e y & e
- (a|b|c|d|e)f ef y &-\1 ef-e
- ((a*|b))* - c - -
- abcd*efg abcdefg y & abcdefg
- ab* xabyabbbz y & ab
- ab* xayabbbz y & a
- (ab|cd)e abcde y &-\1 cde-cd
- [abhgefdc]ij hij y & hij
- ^(ab|cd)e abcde n x\1y xy
- (abc|)ef abcdef y &-\1 ef-
- (a|b)c*d abcd y &-\1 bcd-b
- (ab|ab*)bc abc y &-\1 abc-a
- a([bc]*)c* abc y &-\1 abc-bc
- a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d
- a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d
- a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd
- a[bcd]*dcdcde adcdcde y & adcdcde
- a[bcd]+dcdcde adcdcde n - -
- (ab|a)b*c abc y &-\1 abc-ab
- ((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
- [ -~]* abc y & abc
- [ -~ -~]* abc y & abc
- [ -~ -~ -~]* abc y & abc
- [ -~ -~ -~ -~]* abc y & abc
- [ -~ -~ -~ -~ -~]* abc y & abc
- [ -~ -~ -~ -~ -~ -~]* abc y & abc
- [ -~ -~ -~ -~ -~ -~ -~]* abc y & abc
- [a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha
- ^a(bc+|b[eh])g|.h$ abh y &-\1 bh-
- (bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz-
- (bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j
- (bc+d$|ef*g.|h?i(j|k)) effg n - -
- (bc+d$|ef*g.|h?i(j|k)) bcdd n - -
- (bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz-
- ((((((((((a)))))))))) - c - -
- (((((((((a))))))))) a y & a
- multiple words of text uh-uh n - -
- multiple words multiple words, yeah y & multiple words
- (.*)c(.*) abcde y &-\1-\2 abcde-ab-de
- \((.*), (.*)\) (a, b) y (\2, \1) (b, a)
- SHAR_EOF
- cat << \SHAR_EOF > timer.c
- /*
- * Simple timing program for regcomp().
- *
- * Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not
- * derived from licensed software.
- *
- * Permission is granted to anyone to use this software for any purpose on any
- * computer system, and to redistribute it freely, subject to the following
- * restrictions:
- *
- * 1. The author is not responsible for the consequences of use of this
- * software, no matter how awful, even if they arise from defects in it.
- *
- * 2. The origin of this software must not be misrepresented, either by explicit
- * claim or by omission.
- *
- * 3. Altered versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- *
- * Usage: timer ncomp nexec nsub or timer ncomp nexec nsub regexp string [
- * answer [ sub ] ]
- *
- * The second form is for timing repetitions of a single test case. The first
- * form's test data is a compiled-in copy of the "tests" file. Ncomp, nexec,
- * nsub are how many times to do each regcomp, regexec, and regsub. The way
- * to time an operation individually is to do something like "timer 1 50 1".
- */
- #include <stdio.h>
-
- struct try {
- char *re, *str, *ans, *src, *dst;
- } tests[] = {
- #include "timer.t.h"
- {
- NULL, NULL, NULL, NULL, NULL
- }
- };
-
- #include <regexp.h>
-
- int errreport = 0; /* Report errors via errseen? */
- char *errseen = NULL; /* Error message. */
-
- char *progname;
-
- /* ARGSUSED */
- main(argc, argv)
- int argc;
- char *argv[];
- {
- int ncomp, nexec, nsub;
- struct try one;
- char dummy[512];
-
- if (argc < 4) {
- ncomp = 1;
- nexec = 1;
- nsub = 1;
- } else {
- ncomp = atoi(argv[1]);
- nexec = atoi(argv[2]);
- nsub = atoi(argv[3]);
- }
-
- progname = argv[0];
- if (argc > 5) {
- one.re = argv[4];
- one.str = argv[5];
- if (argc > 6)
- one.ans = argv[6];
- else
- one.ans = "y";
- if (argc > 7) {
- one.src = argv[7];
- one.dst = "xxx";
- } else {
- one.src = "x";
- one.dst = "x";
- }
- errreport = 1;
- try(one, ncomp, nexec, nsub);
- } else
- multiple(ncomp, nexec, nsub);
- exit(0);
- }
-
- void
- regerror(s)
- char *s;
- {
- if (errreport)
- errseen = s;
- else
- error(s, "");
- }
-
- #ifndef ERRAVAIL
- error(s1, s2)
- char *s1;
- char *s2;
- {
- fprintf(stderr, "regexp: ");
- fprintf(stderr, s1, s2);
- fprintf(stderr, "\n");
- exit(1);
- }
- #endif
-
- int lineno = 0;
-
- multiple(ncomp, nexec, nsub)
- int ncomp, nexec, nsub;
- {
- register int i;
- extern char *strchr();
-
- errreport = 1;
- for (i = 0; tests[i].re != NULL; i++) {
- lineno++;
- try(tests[i], ncomp, nexec, nsub);
- }
- }
-
- try(fields, ncomp, nexec, nsub)
- struct try fields;
- int ncomp, nexec, nsub;
- {
- regexp *r;
- char dbuf[BUFSIZ];
- register int i;
-
- errseen = NULL;
- r = regcomp(fields.re);
- if (r == NULL) {
- if (*fields.ans != 'c')
- complain("regcomp failure in `%s'", fields.re);
- return;
- }
- if (*fields.ans == 'c') {
- complain("unexpected regcomp success in `%s'", fields.re);
- free((char *) r);
- return;
- }
- for (i = ncomp - 1; i > 0; i--) {
- free((char *) r);
- r = regcomp(fields.re);
- }
- if (!regexec(r, fields.str)) {
- if (*fields.ans != 'n')
- complain("regexec failure in `%s'", "");
- free((char *) r);
- return;
- }
- if (*fields.ans == 'n') {
- complain("unexpected regexec success", "");
- free((char *) r);
- return;
- }
- for (i = nexec - 1; i > 0; i--)
- (void) regexec(r, fields.str);
- errseen = NULL;
- for (i = nsub; i > 0; i--)
- regsub(r, fields.src, dbuf);
- if (errseen != NULL) {
- complain("regsub complaint", "");
- free((char *) r);
- return;
- }
- if (strcmp(dbuf, fields.dst) != 0)
- complain("regsub result `%s' wrong", dbuf);
- free((char *) r);
- }
-
- complain(s1, s2)
- char *s1;
- char *s2;
- {
- fprintf(stderr, "try: %d: ", lineno);
- fprintf(stderr, s1, s2);
- fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
- }
- SHAR_EOF
- cat << \SHAR_EOF > try.c
- /*
- * Simple test program for regexp(3) stuff. Knows about debugging hooks.
- *
- * Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not
- * derived from licensed software.
- *
- * Permission is granted to anyone to use this software for any purpose on any
- * computer system, and to redistribute it freely, subject to the following
- * restrictions:
- *
- * 1. The author is not responsible for the consequences of use of this
- * software, no matter how awful, even if they arise from defects in it.
- *
- * 2. The origin of this software must not be misrepresented, either by explicit
- * claim or by omission.
- *
- * 3. Altered versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- *
- * Usage: try re [string [output [-]]] The re is compiled and dumped, regexeced
- * against the string, the result is applied to output using regsub(). The -
- * triggers a running narrative from regexec(). Dumping and narrative don't
- * happen unless DEBUG.
- *
- * If there are no arguments, stdin is assumed to be a stream of lines with five
- * fields: a r.e., a string to match it against, a result code, a source
- * string for regsub, and the proper result. Result codes are 'c' for
- * compile failure, 'y' for match success, 'n' for match failure. Field
- * separator is tab.
- */
- #include <stdio.h>
- #include <regexp.h>
-
- #ifdef ERRAVAIL
- char *progname;
- extern char *mkprogname();
- #endif
-
- #ifdef DEBUG
- extern int regnarrate;
- #endif
-
- char buf[BUFSIZ];
-
- int errreport = 0; /* Report errors via errseen? */
- char *errseen = NULL; /* Error message. */
- int status = 0; /* Exit status. */
-
- /* ARGSUSED */
- main(argc, argv)
- int argc;
- char *argv[];
- {
- regexp *r;
- int i;
-
- #ifdef ERRAVAIL
- progname = mkprogname(argv[0]);
- #endif
-
- if (argc == 1) {
- multiple();
- exit(status);
- }
- r = regcomp(argv[1]);
- if (r == NULL)
- error("regcomp failure", "");
- #ifdef DEBUG
- regdump(r);
- if (argc > 4)
- regnarrate++;
- #endif
- if (argc > 2) {
- i = regexec(r, argv[2]);
- printf("%d", i);
- for (i = 1; i < NSUBEXP; i++)
- if (r->startp[i] != NULL && r->endp[i] != NULL)
- printf(" \\%d", i);
- printf("\n");
- }
- if (argc > 3) {
- regsub(r, argv[3], buf);
- printf("%s\n", buf);
- }
- exit(status);
- }
-
- void
- regerror(s)
- char *s;
- {
- if (errreport)
- errseen = s;
- else
- error(s, "");
- }
-
- #ifndef ERRAVAIL
- error(s1, s2)
- char *s1;
- char *s2;
- {
- fprintf(stderr, "regexp: ");
- fprintf(stderr, s1, s2);
- fprintf(stderr, "\n");
- exit(1);
- }
- #endif
-
- int lineno;
-
- regexp badregexp; /* Implicit init to 0. */
-
- multiple()
- {
- char rbuf[BUFSIZ];
- char *field[5];
- char *scan;
- int i;
- regexp *r;
- extern char *strchr();
-
- errreport = 1;
- lineno = 0;
- while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) {
- rbuf[strlen(rbuf) - 1] = '\0'; /* Dispense with \n. */
- lineno++;
- scan = rbuf;
- for (i = 0; i < 5; i++) {
- field[i] = scan;
- if (field[i] == NULL) {
- complain("bad testfile format", "");
- exit(1);
- }
- scan = strchr(scan, '\t');
- if (scan != NULL)
- *scan++ = '\0';
- }
- try(field);
- }
-
- /* And finish up with some internal testing... */
- lineno = 9990;
- errseen = NULL;
- if (regcomp((char *) NULL) != NULL || errseen == NULL)
- complain("regcomp(NULL) doesn't complain", "");
- lineno = 9991;
- errseen = NULL;
- if (regexec((regexp *) NULL, "foo") || errseen == NULL)
- complain("regexec(NULL, ...) doesn't complain", "");
- lineno = 9992;
- r = regcomp("foo");
- if (r == NULL) {
- complain("regcomp(\"foo\") fails", "");
- return;
- }
- lineno = 9993;
- errseen = NULL;
- if (regexec(r, (char *) NULL) || errseen == NULL)
- complain("regexec(..., NULL) doesn't complain", "");
- lineno = 9994;
- errseen = NULL;
- regsub((regexp *) NULL, "foo", rbuf);
- if (errseen == NULL)
- complain("regsub(NULL, ..., ...) doesn't complain", "");
- lineno = 9995;
- errseen = NULL;
- regsub(r, (char *) NULL, rbuf);
- if (errseen == NULL)
- complain("regsub(..., NULL, ...) doesn't complain", "");
- lineno = 9996;
- errseen = NULL;
- regsub(r, "foo", (char *) NULL);
- if (errseen == NULL)
- complain("regsub(..., ..., NULL) doesn't complain", "");
- lineno = 9997;
- errseen = NULL;
- if (regexec(&badregexp, "foo") || errseen == NULL)
- complain("regexec(nonsense, ...) doesn't complain", "");
- lineno = 9998;
- errseen = NULL;
- regsub(&badregexp, "foo", rbuf);
- if (errseen == NULL)
- complain("regsub(nonsense, ..., ...) doesn't complain", "");
- }
-
- try(fields)
- char **fields;
- {
- regexp *r;
- char dbuf[BUFSIZ];
-
- errseen = NULL;
- r = regcomp(fields[0]);
- if (r == NULL) {
- if (*fields[2] != 'c')
- complain("regcomp failure in `%s'", fields[0]);
- return;
- }
- if (*fields[2] == 'c') {
- complain("unexpected regcomp success in `%s'", fields[0]);
- free((char *) r);
- return;
- }
- if (!regexec(r, fields[1])) {
- if (*fields[2] != 'n')
- complain("regexec failure in `%s'", "");
- free((char *) r);
- return;
- }
- if (*fields[2] == 'n') {
- complain("unexpected regexec success", "");
- free((char *) r);
- return;
- }
- errseen = NULL;
- regsub(r, fields[3], dbuf);
- if (errseen != NULL) {
- complain("regsub complaint", "");
- free((char *) r);
- return;
- }
- if (strcmp(dbuf, fields[4]) != 0)
- complain("regsub result `%s' wrong", dbuf);
- free((char *) r);
- }
-
- complain(s1, s2)
- char *s1;
- char *s2;
- {
- fprintf(stderr, "try: %d: ", lineno);
- fprintf(stderr, s1, s2);
- fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
- status = 1;
- }
- SHAR_EOF
- # End of shell archive
- exit 0
- --
- Bob Page, U of Lowell CS Dept. page@swan.ulowell.edu ulowell!page
- Have five nice days.
-